

### -------------------------------------------------------- ###
### ---- 0) Function: Alter gammaCKpar() Source Code to ---- 
###         fix Windows parallelizing code
### -------------------------------------------------------- ###

# copied from gammaCKpar source code, with noted alterations below

gammaCKpar_JBC <- function (matAp, matBp, n.cores = NULL, cut.a = 0.92, cut.p = 0.88, method = "jw", w = 0.1) 
{
  if (any(class(matAp) %in% c("tbl_df", "data.table"))) {
    matAp <- as.data.frame(matAp)[, 1]
  }
  if (any(class(matBp) %in% c("tbl_df", "data.table"))) {
    matBp <- as.data.frame(matBp)[, 1]
  }
  matAp[matAp == ""] <- NA
  matBp[matBp == ""] <- NA
  if (sum(is.na(matAp)) == length(matAp) | length(unique(matAp)) == 
      1) {
    cat("WARNING: You have no variation in this variable, or all observations are missing in dataset A.\n")
  }
  if (sum(is.na(matBp)) == length(matBp) | length(unique(matBp)) == 
      1) {
    cat("WARNING: You have no variation in this variable, or all observations are missing in dataset B.\n")
  }
  if (!(method %in% c("jw", "jaro", "lv"))) {
    stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
  }
  if (method == "jw" & !is.null(w)) {
    if (w < 0 | w > 0.25) {
      stop("Invalid value provided for w. Remember, w in [0, 0.25].")
    }
  }
  if (is.null(n.cores)) {
    n.cores <- detectCores() - 1
  }
  matrix.1 <- as.matrix(as.character(matAp))
  matrix.2 <- as.matrix(as.character(matBp))
  matrix.1[is.na(matrix.1)] <- "9999"
  matrix.2[is.na(matrix.2)] <- "9998"
  u.values.1 <- unique(matrix.1)
  u.values.2 <- unique(matrix.2)
  n.slices1 <- max(round(length(u.values.1)/(4500), 0), 1)
  n.slices2 <- max(round(length(u.values.2)/(4500), 0), 1)
  limit.1 <- round(quantile((0:nrow(u.values.2)), p = seq(0, 
                                                          1, 1/n.slices2)), 0)
  limit.2 <- round(quantile((0:nrow(u.values.1)), p = seq(0, 
                                                          1, 1/n.slices1)), 0)
  n.cores <- min(n.cores, n.slices1 * n.slices2)
  temp.1 <- temp.2 <- list()
  for (i in 1:n.slices2) {
    temp.1[[i]] <- list(u.values.2[(limit.1[i] + 1):limit.1[i + 
                                                              1]], limit.1[i])
  }
  for (i in 1:n.slices1) {
    temp.2[[i]] <- list(u.values.1[(limit.2[i] + 1):limit.2[i + 
                                                              1]], limit.2[i])
  }
  stringvec <- function(m, y, cut, strdist = method, p1 = w) {
    x <- as.matrix(m[[1]])
    e <- as.matrix(y[[1]])
    if (strdist == "jw") {
      t <- 1 - stringdistmatrix(e, x, method = "jw", 
                                p = p1, nthread = 1)
      t[t < cut[[2]]] <- 0
      t <- Matrix(t, sparse = T)
    }
    if (strdist == "jaro") {
      t <- 1 - stringdistmatrix(e, x, method = "jw", 
                                nthread = 1)
      t[t < cut[[2]]] <- 0
      t <- Matrix(t, sparse = T)
    }
    if (strdist == "lv") {
      t <- stringdistmatrix(e, x, method = method, nthread = 1)
      t.1 <- nchar(as.matrix(e))
      t.2 <- nchar(as.matrix(x))
      o <- t(apply(t.1, 1, function(w) {
        ifelse(w >= t.2, w, t.2)
      }))
      t <- 1 - t * (1/o)
      t[t < cut[[2]]] <- 0
      t <- Matrix(t, sparse = T)
    }
    t@x[t@x >= cut[1]] <- 2
    t@x[t@x >= cut[2] & t@x < cut[1]] <- 1
    gc()
    slice.1 <- m[[2]]
    slice.2 <- y[[2]]
    indexes.2 <- which(t == 2, arr.ind = T)
    indexes.2[, 1] <- indexes.2[, 1] + slice.2
    indexes.2[, 2] <- indexes.2[, 2] + slice.1
    indexes.1 <- which(t == 1, arr.ind = T)
    indexes.1[, 1] <- indexes.1[, 1] + slice.2
    indexes.1[, 2] <- indexes.1[, 2] + slice.1
    list(indexes.2, indexes.1)
  }
  do <- expand.grid(1:n.slices2, 1:n.slices1)
  if (n.cores == 1) {
    "%oper%" <- foreach::"%do%"
  } else {
    "%oper%" <- foreach::"%dopar%"
    cl <- makeCluster(n.cores)
    registerDoParallel(cl)
    on.exit(stopCluster(cl))
  }
  temp.f <- foreach(i = 1:nrow(do), .packages = c("stringdist", 
                                                  "Matrix")) %oper% {
                                                    r1 <- do[i, 1]
                                                    r2 <- do[i, 2]
                                                    stringvec(temp.1[[r1]], temp.2[[r2]], c(cut.a, cut.p))
                                                  }
  gc()
  reshape2 <- function(s) {
    s[[1]]
  }
  reshape1 <- function(s) {
    s[[2]]
  }
  temp.2 <- lapply(temp.f, reshape2)
  temp.1 <- lapply(temp.f, reshape1)
  indexes.2 <- do.call("rbind", temp.2)
  indexes.1 <- do.call("rbind", temp.1)
  ht1 <- new.env(hash = TRUE)
  ht2 <- new.env(hash = TRUE)
  n.values.2 <- as.matrix(cbind(u.values.1[indexes.2[, 1]], 
                                u.values.2[indexes.2[, 2]]))
  n.values.1 <- as.matrix(cbind(u.values.1[indexes.1[, 1]], 
                                u.values.2[indexes.1[, 2]]))
  matches.2 <- lapply(seq_len(nrow(n.values.2)), function(i) n.values.2[i, 
                                                                        ])
  matches.1 <- lapply(seq_len(nrow(n.values.1)), function(i) n.values.1[i, 
                                                                        ])
  if (Sys.info()[["sysname"]] == "Windows") {
    if (n.cores == 1) 
      "%oper%" <- foreach::"%do%"
    else {
      "%oper%" <- foreach::"%dopar%"
      cl <- makeCluster(n.cores)
      registerDoParallel(cl)
      on.exit(stopCluster(cl)) #JBC ADDED
    }
    if (length(matches.2) > 0) {
      final.list2 <- foreach(i = 1:length(matches.2)) %oper% 
        {
          ht1 <- which(matrix.1 == matches.2[[i]][[1]])
          ht2 <- which(matrix.2 == matches.2[[i]][[2]])
          list(ht1, ht2)
        }
    }
    if (length(matches.1) > 0) {
      final.list1 <- foreach(i = 1:length(matches.1)) %oper% 
        {
          ht1 <- which(matrix.1 == matches.1[[i]][[1]])
          ht2 <- which(matrix.2 == matches.1[[i]][[2]])
          list(ht1, ht2)
        }
    }
    # if (n.cores > 1) { # JBC REMOVED
    #   stopCluster(cl)
    # }
  } else {
    no_cores <- n.cores
    final.list2 <- mclapply(matches.2, function(s) {
      ht1 <- which(matrix.1 == s[1])
      ht2 <- which(matrix.2 == s[2])
      list(ht1, ht2)
    }, mc.cores = getOption("mc.cores", no_cores))
    final.list1 <- mclapply(matches.1, function(s) {
      ht1 <- which(matrix.1 == s[1])
      ht2 <- which(matrix.2 == s[2])
      list(ht1, ht2)
    }, mc.cores = getOption("mc.cores", no_cores))
  }
  if (length(matches.2) == 0) {
    final.list2 <- list()
    warning("There are no identical (or nearly identical) matches. We suggest either changing the value of cut.p")
  }
  if (length(matches.1) == 0) {
    final.list1 <- list()
    warning("There are no partial matches. We suggest either changing the value of cut.p or using gammaCK2par() instead")
  }
  na.list <- list()
  na.list[[1]] <- which(matrix.1 == "9999")
  na.list[[2]] <- which(matrix.2 == "9998")
  out <- list()
  out[["matches2"]] <- final.list2
  out[["matches1"]] <- final.list1
  out[["nas"]] <- na.list
  class(out) <- c("fastLink", "gammaCKpar")
  return(out)
}


### -------------------------------------------------------- ###
### ---- 1) Function: Alter fastLink Source Code to ---- 
###         Run and Save Output (by parts)
### -------------------------------------------------------- ###
#copied from fastlink source code, with noted alterations below
#mostly altered to save out partial steps in case of crash

#' fastLink
#'
#' Run the fastLink algorithm to probabilistically match
#' two datasets.
#'
#' @usage fastLink(dfA, dfB, varnames, stringdist.match,
#' stringdist.method, numeric.match, partial.match,
#' cut.a, cut.p, jw.weight,
#' cut.a.num, cut.p.num,
#' priors.obj, w.lambda, w.pi,
#' address.field, gender.field, estimate.only, em.obj,
#' dedupe.matches, linprog.dedupe,
#' reweight.names, firstname.field, cond.indep,
#' n.cores, tol.em, threshold.match, return.all, return.df, verbose)
#'
#' @param dfA Dataset A - to be matched to Dataset B
#' @param dfB Dataset B - to be matched to Dataset A
#' @param varnames A vector of variable names to use for matching.
#' Must be present in both dfA and dfB
#' @param stringdist.match A vector of variable names indicating
#' which variables should use string distance matching. Must be a subset of
#' 'varnames' and must not be present in 'numeric.match'.
#' @param stringdist.method String distance method for calculating similarity, options are: "jw" Jaro-Winkler (Default), "jaro" Jaro, and "lv" Edit
#' @param numeric.match A vector of variable names indicating which variables should use numeric matching.
#' Must be a subset of 'varnames' and must not be present in 'stringdist.match'.
#' @param partial.match A vector of variable names indicating whether to include
#' a partial matching category for the string distances. Must be a subset of 'varnames'
#' and 'stringdist.match'.
#' @param cut.a Lower bound for full string-distance match, ranging between 0 and 1. Default is 0.94
#' @param cut.p Lower bound for partial string-distance match, ranging between 0 and 1. Default is 0.88
#' @param jw.weight Parameter that describes the importance of the first characters of a string (only needed if stringdist.method = "jw"). Default is .10
#' @param cut.a.num Lower bound for full numeric match. Default is 1
#' @param cut.p.num Lower bound for partial numeric match. Default is 2.5
#' @param priors.obj A list containing priors for auxiliary movers information,
#' as output from calcMoversPriors(). Default is NULL
#' @param w.lambda How much weight to give the prior on lambda versus the data. Must range between 0 (no weight on prior) and 1 (weight fully on prior).
#' Default is NULL (no prior information provided).
#' @param w.pi How much weight to give the prior on pi versus the data. Must range between 0 (no weight on prior) and 1 (weight fully on prior).
#' Default is NULL (no prior information provided).
#' @param address.field The name of the address field. To be used when 'pi.prior' is included in 'priors.obj'.
#' Default is NULL (no matching variables should have address prior applied). Must be present in 'varnames'.
#' @param gender.field The name of the field indicating gender. If provided, the exact-matching gender prior is used in the EM algorithm.
#' Default is NULL (do not implement exact matching on gender). Must be present in 'varnames'.
#' @param estimate.only Whether to stop running the algorithm after the EM step (omitting getting the matched indices of dataset A and dataset B).
#' Only the EM object will be returned. Can be used when running the match on a random sample and applying to a larger dataset, or for out-of-sample
#' prediction of matches. Default is FALSE.
#' @param em.obj An EM object from a prior run of 'fastLink' or 'emlinkMARmov'. Parameter estimates will be applied to the matching patterns
#' in 'dfA' and 'dfB'. If provided. 'estimate.only' is set to FALSE. Often provided when parameters have been
#' estimated on a smaller sample, and the user wants to apply them to the full dataset. Default is NULL (EM will be estimated from matching patterns in 'dfA' and 'dfB').
#' @param dedupe.matches Whether to dedupe the set of matches returned by the algorithm. Default is TRUE.
#' @param linprog.dedupe If deduping matches, whether to use Winkler's linear programming solution to dedupe. Default is FALSE.
#' @param reweight.names Whether to reweight the posterior match probabilities by the frequency of individual first names. Default is FALSE.
#' @param firstname.field The name of the field indicating first name. Must be provided if reweight.names = TRUE.
#' @param cond.indep Estimates for the parameters of interest are obtained from the Fellegi-Sunter model under conditional independence. Default is TRUE. 
#' If set to FALSE parameters estimates are obtained from a model that allows for dependencies across linkage fields.
#' @param n.cores Number of cores to parallelize over. Default is NULL.
#' @param tol.em Convergence tolerance for the EM Algorithm. Default is 1e-04.
#' @param threshold.match A number between 0 and 1 indicating either the lower bound (if only one number provided) or the range of certainty that the
#' user wants to declare a match. For instance, threshold.match = .85 will return all pairs with posterior probability greater than .85 as matches,
#' while threshold.match = c(.85, .95) will return all pairs with posterior probability between .85 and .95 as matches.
#' @param return.all Whether to return the most likely match for each observation in dfA and dfB. Overrides user setting of \code{threshold.match} by setting
#' \code{threshold.match} to 0.0001, and automatically dedupes all matches. Default is TRUE.
#' @param return.df Whether to return the entire dataframe of dfA and dfB instead of just the indices. Default is FALSE.
#' @param verbose Whether to print elapsed time for each step. Default is FALSE.
#'
#' @return \code{fastLink} returns a list of class 'fastLink' containing the following components if calculating matches:
#' \item{matches}{An nmatches X 2 matrix containing the indices of the successful matches in \code{dfA}
#' in the first column, and the indices of the corresponding successful matches in \code{dfB} in the
#' second column.}
#' \item{EM}{A list with the output of the EM algorithm, which contains the exact matching
#' patterns and the associated posterior probabilities of a match for each matching pattern.}
#' \item{patterns}{A matrix with the observed matching patterns for each successfully matched pair.}
#' \item{nobs.a}{The number of observations in dataset A.}
#' \item{nobs.b}{The number of observations in dataset B.}
#' \item{zeta.name}{If reweighting by name, the posterior probability of a match for each match in dataset A and B.}
#' 
#' If only running the EM and not returning the matched indices, \code{fastLink} only returns the EM object.
#'
#' @author Ted Enamorado <ted.enamorado@gmail.com>, Ben Fifield <benfifield@gmail.com>, and Kosuke Imai
#'
#' @examples
#' \dontrun{
#' fl.out <- fastLink(dfA, dfB,
#' varnames = c("firstname", "lastname", "streetname", "birthyear"),
#' n.cores = 1)
#' }
#' @export
fastLink_JBC <- function(dfA, dfB, varnames,
                     stringdist.match = NULL, 
                     stringdist.method = "jw",
                     numeric.match = NULL, 
                     partial.match = NULL,
                     cut.a = 0.94, cut.p = 0.88,
                     jw.weight = .10,
                     cut.a.num = 1, cut.p.num = 2.5,
                     priors.obj = NULL,
                     w.lambda = NULL, w.pi = NULL, address.field = NULL,
                     gender.field = NULL, estimate.only = FALSE, em.obj = NULL,
                     dedupe.matches = TRUE, linprog.dedupe = FALSE,
                     reweight.names = FALSE, firstname.field = NULL, cond.indep = TRUE,
                     n.cores = NULL, tol.em = 1e-04, threshold.match = 0.85,
                     return.all = FALSE, return.df = FALSE, verbose = FALSE,
                     file_path_root, ABB, attempt_name # JBC ADDED!!!
                     ){
  
  ### --- JBC ADDED!!! --- 
  # set file paths
  file_path_data_matched <- paste0(file_path_root,"/Temp_Data") # Data Folder 
  temp_path_o <- paste0(file_path_root,"/Output_EM/fl_out_") # Summary Output
  temp_path_d <- paste0(file_path_data_matched,"/State_VF_Matches/Matched_") # Matched Data
  ### ---
  
  cat("\n")
  cat(c(paste(rep("=", 20), sep = "", collapse = ""), "\n"))
  cat("fastLink(): Fast Probabilistic Record Linkage\n")
  cat(c(paste(rep("=", 20), sep = "", collapse = ""), "\n\n"))
  
  ## --------------------------------------
  ## Process inputs and stop if not correct
  ## --------------------------------------
  if(any(class(dfA) %in% c("tbl_df", "data.table"))){
    dfA <- as.data.frame(dfA)
  }
  if(any(class(dfB) %in% c("tbl_df", "data.table"))){
    dfB <- as.data.frame(dfB)
  }
  if(any(!(varnames %in% names(dfA)))){
    stop("Some variables in varnames are not present in dfA.")
  }
  if(any(!(varnames %in% names(dfB)))){
    stop("Some variables in varnames are not present in dfB.")
  }
  if(any(!(stringdist.match %in% varnames))){
    stop("You have provided a variable name for stringdist.match that is not in 'varnames'.")
  }
  if(any(!(numeric.match %in% varnames))){
    stop("You have provided a variable name for numeric.match that is not in 'varnames'.")
  }
  if(length(intersect(numeric.match, stringdist.match)) > 0){
    stop("There is a variable present in both 'numeric.match' and 'stringdist.match'. Please select only one matching metric for each variable.")
  }
  if(is.null(numeric.match)) {
    if (any(!(partial.match %in% varnames)) | any(!(partial.match %in% 
                                                    stringdist.match))) {
      stop("You have provided a variable name for 'partial.match' that is not present in either 'varnames', 'numeric.match', or 'stringdist.match'.")
    }
  } else {
    if (any(!(partial.match %in% varnames)) | any(!(partial.match %in% unique(c(stringdist.match, numeric.match))))) {
      stop("You have provided a variable name for 'partial.match' that is not present in either 'varnames', 'numeric.match', or 'stringdist.match'.")
    }
  }    
  if(!is.null(address.field)){
    if(length(address.field) > 1 | length(gender.field) > 1){
      stop("'address.field' must have at most only one variable name.")
    }
    if(!(address.field %in% varnames)){
      stop("You have provided a variable name for 'address.field' that is not in 'varnames'.")
    }
  }
  if(!is.null(gender.field)){
    if(length(gender.field) > 1){
      stop("'gender.field' must have at most one variable name.")
    }
    if(!(gender.field %in% varnames)){
      stop("You have provided a variable name for 'gender.field' that is not in 'varnames'.")
    }
  }
  if(reweight.names == TRUE & is.null(firstname.field)){
    stop("If reweighting the match probability by first name, you must provide the name of the field representing first name.")
  }
  if(!is.null(firstname.field)){
    if(length(firstname.field) > 1){
      stop("'firstname.field' must have at most one variable name.")
    }
    if(!(firstname.field %in% varnames)){
      stop("You have provided a variable name for 'firstname.field' that is not in 'varnames'.")
    }
  }
  if(!is.null(em.obj)){
    if(!("fastLink.EM" %in% class(em.obj))){
      stop("If providing an EM object, it must be of class 'fastLink.EM'.")
    }
  }
  if(!is.null(em.obj) & estimate.only){
    estimate.only <- FALSE
    cat("You have provided an EM object but have set 'estimate.only' to TRUE. Setting 'estimate.only' to FALSE so that matched indices are returned.\n")
  }
  if(!(stringdist.method %in% c("jw", "jaro", "lv"))){
    stop("Invalid string distance method. Method should be one of 'jw', 'jaro', or 'lv'.")
  }
  if(stringdist.method == "jw" & !is.null(jw.weight)){
    if(jw.weight < 0 | jw.weight > 0.25){
      stop("Invalid value provided for jw.weight. Remember, jw.weight in [0, 0.25].")
    }
  }
  if(return.all){
    threshold.match <- 0.001
    if(!dedupe.matches){
      cat("You have specified that all matches be returned but are not deduping the matches. The resulting object may be very large.\n")
    }
  }else{
    cat("If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.\n")
  }
  if(!is.null(priors.obj) & cond.indep == FALSE){
    cat("The current implementation of fastLink can only incorporate prior information under the conditionally independent model. Ignoring prior information in estimation.")
    priors.obj <- NULL
    w.lambda <- NULL
    w.pi <- NULL
    address.field <- NULL
    gender.field <- NULL
  }
  
  ## Check class of numeric indicators
  classA <- lapply(dfA[,varnames], class)
  classB <- lapply(dfB[,varnames], class)
  if(any(unlist(classA)[names(classA) %in% numeric.match] != "numeric") |
     any(unlist(classB)[names(classB) %in% numeric.match] != "numeric")){
    stop("You have specified that a variable be compared using numeric matching, but that variable is not of class 'numeric'. Please check your variable classes.")
  }
  
  ## Check if data frames are identical
  dedupe.df <- FALSE
  if(identical(dfA, dfB)){
    cat("dfA and dfB are identical, assuming deduplication of a single data set.\nSetting return.all to FALSE.\n\n")
    dedupe.matches <- FALSE
    return.all <- FALSE
    dedupe.df <- TRUE
  }
  
  ## Create boolean indicators
  sm.bool <- which(varnames %in% stringdist.match)
  stringdist.match <- rep(FALSE, length(varnames))
  if(length(sm.bool) > 0){
    stringdist.match[sm.bool] <- TRUE
  }
  
  nm.bool <- which(varnames %in% numeric.match)
  numeric.match <- rep(FALSE, length(varnames))
  if(length(nm.bool) > 0){
    numeric.match[nm.bool] <- TRUE
  }
  
  pm.bool <- which(varnames %in% partial.match)
  partial.match <- rep(FALSE, length(varnames))
  if(length(pm.bool) > 0){
    partial.match[pm.bool] <- TRUE
  }
  
  af.bool <- which(varnames %in% address.field)
  address.field <- rep(FALSE, length(varnames))
  if(length(af.bool) > 0){
    address.field[af.bool] <- TRUE
  }
  
  gf.bool <- which(varnames %in% gender.field)
  gender.field <- rep(FALSE, length(varnames))
  if(length(gf.bool) > 0){
    gender.field[gf.bool] <- TRUE
  }
  
  fn.bool <- which(varnames %in% firstname.field)
  firstname.field <- rep(FALSE, length(varnames))
  if(length(fn.bool) > 0){
    firstname.field[fn.bool] <- TRUE
  }
  
  ## ----------------------------
  ## Calculate agreement patterns
  ## ----------------------------
  cat("Calculating matches for each variable.\n")
  start <- Sys.time()
  gammalist <- vector(mode = "list", length = length(varnames))
  for(i in 1:length(gammalist)){
    if(verbose){
      matchtype <- ifelse(stringdist.match[i], "string-distance", ifelse(numeric.match[i], "numeric", "exact"))
      cat("    Matching variable", varnames[i], "using", matchtype, "matching.\n")
    }
    ## Convert to character
    if(is.factor(dfA[,varnames[i]]) | is.factor(dfB[,varnames[i]])){
      dfA[,varnames[i]] <- as.character(dfA[,varnames[i]])
      dfB[,varnames[i]] <- as.character(dfB[,varnames[i]])
    }
    ## Warn if no variation (except for gender blocking)
    if(!gender.field[i]){
      if(sum(is.na(dfA[,varnames[i]])) == nrow(dfA) | length(unique(dfA[,varnames[i]])) == 1){
        cat(paste("WARNING: You have no variation in dataset A for", varnames[i], "or all observations are missing."))
      }
      if(sum(is.na(dfB[,varnames[i]])) == nrow(dfB) | length(unique(dfB[,varnames[i]])) == 1){
        cat(paste("WARNING: You have no variation in dataset B for", varnames[i], "or all observations are missing."))
      }
    }
    if(sum(dfA[,varnames[i]] %in% dfB[,varnames[i]]) == 0){
      cat(paste0("WARNING: You have no exact matches for ", varnames[i], "."))
    }
    ## Get patterns
    if(stringdist.match[i]){
      if(partial.match[i]){
        gammalist[[i]] <- gammaCKpar_JBC( #JBC FUNCTION!
          dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, cut.p = cut.p, method = stringdist.method, w = jw.weight, n.cores = n.cores
        )
      }else{
        gammalist[[i]] <- gammaCK2par(dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a, method = stringdist.method, w = jw.weight, n.cores = n.cores)
      }
    }else if(numeric.match[i]){
      if(partial.match[i]){
        gammalist[[i]] <- gammaNUMCKpar(
          dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a.num, cut.p = cut.p.num, n.cores = n.cores
        )
      }else{
        gammalist[[i]] <- gammaNUMCK2par(
          dfA[,varnames[i]], dfB[,varnames[i]], cut.a = cut.a.num, n.cores = n.cores
        )
      }
    }else{
      gammalist[[i]] <- gammaKpar(dfA[,varnames[i]], dfB[,varnames[i]], gender = gender.field[i], n.cores = n.cores)
    }
  }
  end <- Sys.time()
  if(verbose){
    cat("Calculating matches for each variable took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
  }
  
  ### --- JBC ADDED!!! --- 
  # save output (step 1)
  save(gammalist, 
       file=paste0(file_path_data_matched,"/State_VF_Match_Steps/",ABB,"_Step1_",attempt_name,".RData"))
  ### ---
  
  
  ## Get row numbers
  nr_a <- nrow(dfA)
  nr_b <- nrow(dfB)
  
  ## ------------------------------
  ## Get counts for zeta parameters
  ## ------------------------------
  cat("Getting counts for parameter estimation.\n")
  start <- Sys.time()
  counts <- tableCounts(gammalist, nobs.a = nr_a, nobs.b = nr_b, n.cores = n.cores)
  end <- Sys.time()
  if(verbose){
    cat("Getting counts for parameter estimation took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
  }
  
  ### --- JBC ADDED!!! --- 
  # save output (step 2)
  save(counts, 
       file=paste0(file_path_data_matched,"/State_VF_Match_Steps/",ABB,"_Step2_",attempt_name,".RData"))
  ### ---
  
  ## ------------------------------
  ## Run or impute the EM algorithm
  ## ------------------------------
  if(is.null(em.obj)){
    ## Run EM algorithm
    cat("Running the EM algorithm.\n")
    start <- Sys.time()
    if(is.null(priors.obj)){
      lambda.prior <- NULL
      pi.prior <- NULL
    }else{
      if("lambda.prior" %in% names(priors.obj)){
        lambda.prior <- priors.obj$lambda.prior
      }
      if("pi.prior" %in% names(priors.obj)){
        if(!("lambda.prior" %in% names(priors.obj))){
          stop("Must specify a prior for lambda if providing a prior for pi.")
        }
        pi.prior <- priors.obj$pi.prior
      }else{
        pi.prior <- NULL
      }
    }
    if(cond.indep == FALSE){
      resultsEM <- emlinklog(patterns = counts, nobs.a = nr_a, nobs.b = nr_b,
                             tol = tol.em, varnames = varnames)  
    }else{
      resultsEM <- emlinkMARmov(patterns = counts, nobs.a = nr_a, nobs.b = nr_b,
                                tol = tol.em,
                                prior.lambda = lambda.prior, w.lambda = w.lambda,
                                prior.pi = pi.prior, w.pi = w.pi,
                                address.field = address.field, 
                                gender.field = gender.field,
                                varnames = varnames)
    }
    end <- Sys.time()
    if(verbose){
      cat("Running the EM algorithm took", round(difftime(end, start, units = "secs"), 2), "seconds.\n\n")
    }
  }else{
    cat("Imputing matching probabilities using provided EM object.\n")
    resultsEM <- emlinkRS(counts, em.obj, nr_a, nr_b)
  }
  
  if(max(resultsEM$zeta.j) < threshold.match) {
    warning(paste0("No matches found for the threshold value used. We recommend trying a lower threshold.match value. Note that you currently have threshold.match set to ", threshold.match, "."))
  }
  
  ### --- JBC ADDED!!! --- 
  # save output (step 3)
  save(resultsEM, 
       file=paste0(file_path_data_matched,"/State_VF_Match_Steps/",ABB,"_Step3_",attempt_name,".RData"))
  ### ---
  
  ## -----------------------------------------------
  ## Get the estimated matches, dedupe, and reweight
  ## -----------------------------------------------
  if(!estimate.only){
    
    ## Get matches
    cat("Getting the indices of estimated matches.\n")
    start <- Sys.time()
    matches <- matchesLink(gammalist, nobs.a = nr_a, nobs.b = nr_b,
                           em = resultsEM, thresh = threshold.match,
                           n.cores = n.cores)
    end <- Sys.time()
    if(verbose){
      cat("Getting the indices of estimated matches took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
    }
    
    ### --- JBC ADDED!!! --- 
    # save output (step 4)
    save(matches, 
         file=paste0(file_path_data_matched,"/State_VF_Match_Steps/",ABB,"_Step4_",attempt_name,".RData"))
    ### ---
    
    ## Get the patterns
    patterns <- getPatterns(matchesA = dfA[matches$inds.a, ], matchesB = dfB[matches$inds.b, ],
                            varnames = varnames, stringdist.match = stringdist.match,
                            numeric.match = numeric.match, partial.match = partial.match,
                            stringdist.method = stringdist.method,
                            cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight,
                            cut.a.num = cut.a.num, cut.p.num = cut.p.num)
    
    ## Run deduplication
    if(dedupe.matches & length(matches$inds.a) > 0){
      cat("Deduping the estimated matches.\n")
      start <- Sys.time()
      ddm.out <- dedupeMatches(matchesA = dfA[matches$inds.a,], matchesB = dfB[matches$inds.b,],
                               EM = resultsEM, matchesLink = matches, patterns = patterns,
                               linprog = linprog.dedupe)
      matches <- ddm.out$matchesLink
      resultsEM <- ddm.out$EM
      end <- Sys.time()
      if(verbose){
        cat("Deduping the estimated matches took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
      }
      
      ### --- JBC ADDED!!! --- 
      # save output (step 5)
      save(ddm.out, 
           file=paste0(file_path_data_matched,"/State_VF_Match_Steps/",ABB,"_Step5_",attempt_name,".RData"))
      ### ---
      
    }else if(length(matches$inds.a) > 0){
      cat("Calculating the posterior for each pair of matched observations.\n")
      start <- Sys.time()
      zeta <- getPosterior(dfA[matches$inds.a,], dfB[matches$inds.b,], EM = resultsEM,
                           patterns = patterns)
      end <- Sys.time()
      if(verbose){
        cat("Calculating the posterior for each matched pair took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
      }
    }
    
    ## Get the patterns
    cat("Getting the match patterns for each estimated match.\n")
    start <- Sys.time()
    patterns <- getPatterns(matchesA = dfA[matches$inds.a, ], matchesB = dfB[matches$inds.b, ],
                            varnames = varnames, stringdist.match = stringdist.match,
                            numeric.match = numeric.match, partial.match = partial.match,
                            stringdist.method = stringdist.method,
                            cut.a = cut.a, cut.p = cut.p, jw.weight = jw.weight,
                            cut.a.num = cut.a.num, cut.p.num = cut.p.num)
    end <- Sys.time()
    if(verbose){
      cat("Getting the match patterns for each estimated match took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
    }
    
    ## Reweight first names or get zeta
    if(reweight.names & length(matches$inds.a) > 0){
      cat("Reweighting match probabilities by frequency of occurrence.\n")
      start <- Sys.time()
      rwn.out <- nameReweight(dfA, dfB, EM = resultsEM, gammalist = gammalist, matchesLink = matches,
                              varnames = varnames, firstname.field = firstname.field,
                              patterns = patterns, threshold.match = threshold.match, n.cores = n.cores)
      end <- Sys.time()
      if(verbose){
        cat("Reweighting by first name took", round(difftime(end, start, units = "mins"), 2), "minutes.\n\n")
      }
    }
    
    ## Return object
    out <- list()
    if(return.df){
      out[["dfA.match"]] <- dfA[matches$inds.a,]
      out[["dfB.match"]] <- dfB[matches$inds.b,]
    }
    out[["matches"]] <- matches
    out[["EM"]] <- resultsEM
    out[["patterns"]] <- patterns
    if(dedupe.matches & length(matches$inds.a) > 0){
      out[["posterior"]] <- ddm.out$max.zeta
    }else if(length(matches$inds.a) > 0){
      out[["posterior"]] <- zeta
    }
    if(reweight.names & length(matches$inds.a) > 0){
      out[["posterior"]] <- rwn.out
    }
    out[["nobs.a"]] <- nr_a
    out[["nobs.b"]] <- nr_b
    if(return.all){
      class(out) <- c("fastLink", "confusionTable")
    }else{
      class(out) <- "fastLink"
    }
    if(dedupe.df){
      class(out) <- c(class(out), "fastLink.dedupe")
    }
  }else{
    out <- resultsEM
  }
  
  ### --- JBC ADDED!!! --- 
  # save output 
  save(out, 
       file=paste0(file_path_data_matched,"/State_VF_Match_Steps/",ABB,"_Step6_",attempt_name,".RData"))
  ### ---
  
  
  ### --- JBC ADDED!!! --- 
  # Saves copy of summary output to "Output" subfolder
  utils::capture.output(summary(out), file = paste0(temp_path_o,ABB,"_",attempt_name,".txt"))
  # Save matches as tibbles (aka the faster dplyr version of a data frame) 
  TFA_matches <- dfA[out$matches$inds.a,] %>% as_tibble()
  VF_matches <- dfB[out$matches$inds.b,] %>% as_tibble()
  # Coerce all character vars in matched VF to factors (deals with the mising string vars)
  VF_matches %<>% mutate_all(as.factor) 
  # Export
  save(TFA_matches, file=paste0(temp_path_d,ABB,"_",attempt_name,"_TFA.RData"))
  save(VF_matches, file=paste0(temp_path_d,ABB,"_",attempt_name,"_VF.RData"))
  ### ---
  
  # return(out) # JBC COMMENTED OUT!!!
  
}




